– Regression (predict a continuous variable) with "wine quality" dataset
– Classification (predict category membership) with "breast cancer" dataset
– Classification (predict category membership) with "wine quality" dataset: Good and bad wine
10/1/2018
– Regression (predict a continuous variable) with "wine quality" dataset
– Classification (predict category membership) with "breast cancer" dataset
– Classification (predict category membership) with "wine quality" dataset: Good and bad wine
– Kaggle hosts machine learning competitions, data, and advice
– Wisconsin Cancer Diagnosis Data https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
– Wine Quality Data https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009
http://topepo.github.io/caret/index.html
– Provides an integrated set of functions to support machine learning
– Provides uniform interface to over 200 algorithms (e.g., linear regression, random forest, support vector machines)
– Makes training and testing many types of models very easy
– Incorporates sensible defaults that often work well
– Partition data into training and test sets
– Pre-process data and select features
– Tune model hyperparameters with cross validation
– Estimate variable importance
– Assess predictions and model performance with test data
– Compare model performance
# Load the red wine quality data and summarize every variable
wine.df <- read_csv("winequality-red.csv")
wine.df %>% skim()
## Skim summary statistics ## n obs: 1599 ## n variables: 12 ## ## ── Variable type:integer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n mean sd p0 p25 p50 p75 p100 hist ## quality 0 1599 1599 5.64 0.81 3 5 6 6 8 ▁▁▁▇▇▁▂▁ ## ## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n mean sd p0 p25 ## alcohol 0 1599 1599 10.42 1.07 8.4 9.5 ## chlorides 0 1599 1599 0.087 0.047 0.012 0.07 ## citric acid 0 1599 1599 0.27 0.19 0 0.09 ## density 0 1599 1599 1 0.0019 0.99 1 ## fixed acidity 0 1599 1599 8.32 1.74 4.6 7.1 ## free sulfur dioxide 0 1599 1599 15.87 10.46 1 7 ## pH 0 1599 1599 3.31 0.15 2.74 3.21 ## residual sugar 0 1599 1599 2.54 1.41 0.9 1.9 ## sulphates 0 1599 1599 0.66 0.17 0.33 0.55 ## total sulfur dioxide 0 1599 1599 46.47 32.9 6 22 ## volatile acidity 0 1599 1599 0.53 0.18 0.12 0.39 ## p50 p75 p100 hist ## 10.2 11.1 14.9 ▂▇▅▃▂▁▁▁ ## 0.079 0.09 0.61 ▇▃▁▁▁▁▁▁ ## 0.26 0.42 1 ▇▅▅▆▂▁▁▁ ## 1 1 1 ▁▁▃▇▇▂▁▁ ## 7.9 9.2 15.9 ▁▇▇▅▂▁▁▁ ## 14 21 72 ▇▇▅▂▁▁▁▁ ## 3.31 3.4 4.01 ▁▁▅▇▅▁▁▁ ## 2.2 2.6 15.5 ▇▂▁▁▁▁▁▁ ## 0.62 0.73 2 ▂▇▂▁▁▁▁▁ ## 38 62 289 ▇▅▂▁▁▁▁▁ ## 0.52 0.64 1.58 ▂▇▇▃▁▁▁▁
# Scatter plots of each predictor (columns 2-11) against rated quality,
# with a smooth trend (span = 0.5) overlaid on the points, in a 5 x 2 grid
featurePlot(
  x = wine.df[, 2:11],
  y = wine.df$quality,
  plot = "scatter",
  type = c("p", "smooth"),
  span = .5,
  layout = c(5, 2)
)
# FIX: two statements were fused onto a single line by the export, which does
# not parse. Split and reformat them.
# Discretize quality into a binary class: "good" (> 5) vs. "bad" (<= 5)
quality.wine.df <- wine.df %>%
  mutate(goodwine = if_else(quality > 5, "good", "bad")) %>%
  mutate(goodwine = as.factor(goodwine))

# Visualize how the discretized class relates to the original rated quality
ggplot(quality.wine.df,
       aes(goodwine, quality, colour = goodwine, fill = goodwine)) +
  geom_point(size = .5, alpha = .7,
             position = position_jitter(height = 0.1)) +
  labs(x = "Discretized wine quality", y = "Rated wine quality") +
  theme(legend.position = "none")
# Drop the numeric quality score now that the binary class has been derived
wine.df <- select(quality.wine.df, -quality)
– Proportions of class variable—good and bad wine—should be similar
– Proportions of class variables should be similar in test and training data
– createDataPartition creates partitions that maintain the class distribution
# FIX: five statements were fused onto one line by the export; split them.
# Stratified 75/25 split that preserves the good/bad class proportions
inTrain <- createDataPartition(wine.df$goodwine, p = 3/4, list = FALSE)
trainDescr <- wine.df[inTrain, -12]   # All but class variable (column 12)
testDescr  <- wine.df[-inTrain, -12]
trainClass <- wine.df$goodwine[inTrain]
testClass  <- wine.df$goodwine[-inTrain]
# Class balance in the full data, as percentages
round(prop.table(table(wine.df$goodwine)), 3) * 100
## . ## bad good ## 46.5 53.5
# Class balance in the training partition, as percentages
round(prop.table(table(trainClass)), 3) * 100
## . ## bad good ## 46.5 53.5
# Class balance in the test partition, as percentages
round(prop.table(table(testClass)), 3) * 100
## . ## bad good ## 46.6 53.4
– Eliminate variables with no variability
– Eliminate highly correlated variables
– Select predictive features
– Engineer predictive features
– preProcess also supports other preprocessing methods, such as PCA and ICA
– center subtracts mean
– scale normalizes based on standard deviation
# Learn centering/scaling parameters from the training predictors only,
# then apply that same transformation to both training and test sets
# (avoids leaking test-set statistics into training)
xTrans <- preProcess(trainDescr, method = c("center", "scale"))
trainScaled <- predict(xTrans, trainDescr)
testScaled  <- predict(xTrans, testDescr)
– Load and review the Wisconsin cancer data
# Read the Wisconsin breast cancer data and review variable summaries
cancer.df <- read_csv("cancer.csv")
cancer.df %>% skim()
– Identify the diagnosis indicator
– Partition data
# Stratified 75/25 split on the diagnosis label; drop the first two columns
# from the predictor matrices (all but class variable, per original note)
inTrain <- createDataPartition(cancer.df$diagnosis, p = 3/4, list = FALSE)
trainDescr <- cancer.df[inTrain, -(1:2)]  # All but class variable
testDescr  <- cancer.df[-inTrain, -(1:2)]
trainClass <- cancer.df$diagnosis[inTrain]
testClass  <- cancer.df$diagnosis[-inTrain]
– Pre-process data
# Center and scale using statistics estimated from the training set alone,
# then transform both partitions with those same parameters
xTrans <- preProcess(trainDescr, method = c("center", "scale"))
trainScaled <- predict(xTrans, trainDescr)
testScaled  <- predict(xTrans, testDescr)
– Used to select best combination of predictors and model parameters
– Estimates model performance (e.g., AUC or r-square) for each candidate model
– Uses a random subset of the training data to train the model and a withheld subset to test
# Grouped folds keyed on Subject, so no subject's observations are split
# between training and validation within a fold
sleep.df <- sleepstudy
folds <- groupKFold(sleep.df$Subject, k = 18)
– Select cross validation method: 10-fold repeated cross validation is common
– Define hyperparameter selection method: grid search is the simplest approach
– Define summary measures
– trainControl command specifies all these parameters in a single statement
# Resampling and tuning specification shared by the train() calls below.
# FIX: the variable name was garbled in the source ("trainControltrain.control"
# — the function name fused onto the variable), which is a syntax error.
# The intended name is train.control, referenced later via trControl.
train.control <- trainControl(
  method = "repeatedcv",
  number = 10, repeats = 3,        # number: number of folds
  search = "grid",                 # for tuning hyperparameters
  classProbs = TRUE,               # keep class probabilities for ROC summary
  savePredictions = "final",
  summaryFunction = twoClassSummary
)
– Over 200 different models from 50 categories (e.g., Linear regression, boosting, bagging, cost sensitive learning)
– List of models: http://caret.r-forge.r-project.org/modelList.html
– The "train" statement can train any of them
– Here we select three:
- Logistic regression - Support vector machine - Xgboost, a boosted random forest that performs well in many situations
The train function – Specify class and predictor variables
– Specify one of the over 200 models (e.g., xgboost)
– Specify the metric, such as ROC
– Include the train control specified earlier
– Logistic regression has no tuning parameters
– 10-fold repeated (3 times) cross-validation occurs once
– Produces a total of 30 instances of model fitting and testing
– Cross validation provides a nearly unbiased estimate of the performance of the model on the held out data
# FIX: two statements were fused onto one line by the export; split them.
# Logistic regression has no tuning parameters, so cross-validation here
# only estimates out-of-sample performance (ROC)
glm.fit <- train(
  x = trainScaled, y = trainClass,
  method = "glm", metric = "ROC",
  trControl = train.control
)
glm.fit
## Generalized Linear Model ## ## 1200 samples ## 11 predictor ## 2 classes: 'bad', 'good' ## ## No pre-processing ## Resampling: Cross-Validated (10 fold, repeated 3 times) ## Summary of sample sizes: 1080, 1079, 1081, 1081, 1080, 1079, ... ## Resampling results: ## ## ROC Sens Spec ## 0.8177253 0.7358874 0.7637821
Linear support vector machines have a single tuning parameter–C
C (Cost)
A large cost (e.g., C = 1000) produces a "hard margin" that tends to be sensitive to individual data points and is prone to overfitting
https://stackoverflow.com/questions/4629505/svm-hard-or-soft-margins
# FIX: three statements were fused onto one line by the export; split them.
# Candidate values for the SVM cost parameter C
grid <- expand.grid(C = c(.1, .2, .4, 1, 2, 4))

# Linear SVM tuned over the C grid via the shared train.control resampling
svm.fit <- train(
  x = trainScaled, y = trainClass,
  method = "svmLinear", metric = "ROC",
  tuneGrid = grid,   # Overrides tuneLength
  tuneLength = 3,    # Number of levels of each hyperparameter, unless specified by grid
  trControl = train.control,
  scaled = TRUE
)

# Cross-validated ROC as a function of C
plot(svm.fit)
Classification depends on adding outcomes across many trees
Chen, T., & Guestrin, C. (2016). XGBoost: A scalable tree boosting system. Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 785–794.
Trees are built in sequence to address the errors (residuals) of the previous trees
– nrounds (# Boosting Iterations)–model robustness
– max_depth (Max Tree Depth)–model complexity
– eta (Shrinkage)–model robustness
– gamma (Minimum Loss Reduction)–model complexity
– colsample_bytree (Subsample Ratio of Columns)–model robustness
– min_child_weight (Minimum Sum of Instance Weight)–model complexity
– subsample (Subsample Percentage)–model robustness
A grid search with 3 levels for each parameter produces 3^7 combinations!
tuneLength = 3 produces
- nrounds (# Boosting Iterations) (50 100 150) - max_depth (Max Tree Depth) (1, 2, 3) - eta (Shrinkage) (.3, .4) - gamma (Minimum Loss Reduction) (0) - colsample_bytree (Subsample Ratio of Columns) (.6, .8) - min_child_weight (Minimum Sum of Instance Weight) (1) - subsample (Subsample Percentage) (.50, .75, 1.0)
– 108 different model combinations each trained and tested 10X3 times
# FIX: reformat the single overlong exported line into readable code.
# Train xgboost; per the notes above, tuneLength = 3 expands to 108
# hyperparameter combinations, each fit and tested 10 x 3 times
xgb.fit <- train(
  x = trainScaled, y = trainClass,
  method = "xgbTree", metric = "ROC",
  tuneLength = 3,    # Depends on number of parameters in algorithm
  trControl = train.control,
  scaled = TRUE
)
– Set training parameters
# Recap: resampling/tuning specification — 10-fold CV repeated 3 times,
# grid search, class probabilities retained so twoClassSummary can compute ROC
train.control <-
  trainControl(
    method = "repeatedcv",
    number = 10, repeats = 3,     # number: number of folds
    search = "grid",              # for tuning hyperparameters
    classProbs = TRUE,
    savePredictions = "final",
    summaryFunction = twoClassSummary
  )
– Train an extreme boosting tree
# Recap: fit the gradient-boosted tree ensemble with a 3-level tuning grid
xgb.fit <- train(
  x = trainScaled,
  y = trainClass,
  method = "xgbTree",
  metric = "ROC",
  tuneLength = 3,               # Depends on number of parameters in algorithm
  trControl = train.control,
  scaled = TRUE
)
– Identify the best combination of Hyperparameters
# FIX: two statements were fused onto one line by the export; split them.
# Evaluate logistic regression on the held-out test set
glm.pred <- predict(glm.fit, testScaled)
confusionMatrix(glm.pred, testClass)
## Confusion Matrix and Statistics ## ## Reference ## Prediction bad good ## bad 137 48 ## good 49 165 ## ## Accuracy : 0.7569 ## 95% CI : (0.7117, 0.7982) ## No Information Rate : 0.5338 ## P-Value [Acc > NIR] : <2e-16 ## ## Kappa : 0.5114 ## Mcnemar's Test P-Value : 1 ## ## Sensitivity : 0.7366 ## Specificity : 0.7746 ## Pos Pred Value : 0.7405 ## Neg Pred Value : 0.7710 ## Prevalence : 0.4662 ## Detection Rate : 0.3434 ## Detection Prevalence : 0.4637 ## Balanced Accuracy : 0.7556 ## ## 'Positive' Class : bad ##
# FIX: two statements were fused onto one line by the export; split them.
# Evaluate the linear SVM on the held-out test set
svm.pred <- predict(svm.fit, testScaled)
confusionMatrix(svm.pred, testClass)
## Confusion Matrix and Statistics ## ## Reference ## Prediction bad good ## bad 137 47 ## good 49 166 ## ## Accuracy : 0.7594 ## 95% CI : (0.7144, 0.8005) ## No Information Rate : 0.5338 ## P-Value [Acc > NIR] : <2e-16 ## ## Kappa : 0.5163 ## Mcnemar's Test P-Value : 0.9187 ## ## Sensitivity : 0.7366 ## Specificity : 0.7793 ## Pos Pred Value : 0.7446 ## Neg Pred Value : 0.7721 ## Prevalence : 0.4662 ## Detection Rate : 0.3434 ## Detection Prevalence : 0.4612 ## Balanced Accuracy : 0.7580 ## ## 'Positive' Class : bad ##
# FIX: two statements were fused onto one line by the export; split them.
# Evaluate xgboost on the held-out test set
xgb.pred <- predict(xgb.fit, testScaled)
confusionMatrix(xgb.pred, testClass)
## Confusion Matrix and Statistics ## ## Reference ## Prediction bad good ## bad 160 20 ## good 26 193 ## ## Accuracy : 0.8847 ## 95% CI : (0.8492, 0.9143) ## No Information Rate : 0.5338 ## P-Value [Acc > NIR] : <2e-16 ## ## Kappa : 0.7679 ## Mcnemar's Test P-Value : 0.461 ## ## Sensitivity : 0.8602 ## Specificity : 0.9061 ## Pos Pred Value : 0.8889 ## Neg Pred Value : 0.8813 ## Prevalence : 0.4662 ## Detection Rate : 0.4010 ## Detection Prevalence : 0.4511 ## Balanced Accuracy : 0.8832 ## ## 'Positive' Class : bad ##
# FIX: two statements were fused onto one line by the export; split them.
# Compare the cross-validated ROC distributions of the three models
mod.resamps <- resamples(list(glm = glm.fit, svm = svm.fit, xgb = xgb.fit))
bwplot(mod.resamps, metric = "ROC")
# dotplot(mod.resamps, metric="ROC")
– Assess model performance
# Recap: score the tuned xgboost model on the test set and summarize errors
xgb.pred <- xgb.fit %>% predict(testScaled)
confusionMatrix(xgb.pred, testClass)
– Assess variable importance
# Plot predictors ranked by their scaled importance in the xgboost model
varImp(xgb.fit, scale = TRUE) %>% plot()
library(FFTrees)

# Rebuild the wine data with a logical class label.
# FIX: `if_else(quality > 5, TRUE, FALSE)` is redundant — the comparison
# `quality > 5` already yields the identical logical vector.
wine.df <- read_csv("winequality-red.csv") %>%
  mutate(goodwine = quality > 5) %>%
  select(-quality)

# Stratified 75/25 train/test split on the class label
inTrain <- createDataPartition(wine.df$goodwine, p = 3/4, list = FALSE)
train.wine.df <- wine.df[inTrain, ]
test.wine.df  <- wine.df[-inTrain, ]

# Fit a fast-and-frugal tree; do.comp = FALSE skips fitting the
# comparison algorithms
fft.fit <- FFTrees(formula = goodwine ~ ., data = train.wine.df,
                   do.comp = FALSE)
– Similar process different performance metrics
- RMSE--Root mean square error - MAE--Mean absolute error
– General issues with model metrics
- How to penalize the model for large deviations? - Does the sign of the error matter? - How to define and ensure fair algorithms? - Similar to the issue with classification: Are misses and false alarms equally problematic?
– Cost sensitive learning and optimal \(\beta\)
– Partition data into training and test sets
– Pre-process data and select features
– Tune model hyperparameters with cross validation
– Estimate variable importance
– Assess predictions and model performance with test data
– Compare model performance
At each step be sure to model with people in mind